### FIGURE 4 ###
library(ggplot2)
library(RColorBrewer) # Colour
library(dplyr) # Dataframe process
library(tidyverse)
library(ggrepel)
library(cowplot) # Join plots
library("ggpubr") # Statistics to the figures
library(ggsignif)
library(ggpp)
library(rstatix)

# PANEL A ----
# Violin/box plot of the number of prophages per genome for each defense
# system, with every group compared against genomes labelled "None".

# Loading data (row names are taken from the second column of the file)
dfa <- read.table(
  file = "matrix_90_ab_ml_1prcst_nored100_cl.tsv",
  header = TRUE, sep = "\t", row.names = 2
)
dfa$X <- NULL

# Count total number of prophages (row sums over the first 351 columns)
dfa <- dfa %>% mutate(total_phages = rowSums(select(., 1:351)))
# Vector-level assignment: unlike `dfa[cond, ]$types <- ...` this does not
# fail when no row has an empty `types` value.
dfa$types[dfa$types == ""] <- "None"

### subset for types ####
# Rows whose `types` string matches `pattern`, relabelled with a single `label`.
# A genome whose `types` matches several patterns ends up in several subsets.
subset_by_system <- function(data, pattern, label = pattern) {
  hits <- data[grepl(pattern, data$types), ]
  hits$types <- rep(label, nrow(hits))
  hits
}

Cas <- subset_by_system(dfa, "Cas")
RM <- subset_by_system(dfa, "R-M")
CBASS <- subset_by_system(dfa, "CBASS")
Septu <- subset_by_system(dfa, "Septu")
SspBCDE <- subset_by_system(dfa, "SspBCDE")
Gao_Qat <- subset_by_system(dfa, "Gao_Qat")
PDT4 <- subset_by_system(dfa, "PD-T4-5", label = "PD-T4")
PDT7 <- subset_by_system(dfa, "PD-T7-5", label = "PD-T7")
Gabija <- subset_by_system(dfa, "Gabija")
RTA <- subset_by_system(dfa, "RosmerTA")
# The reference group keeps its original `types` values (no relabelling)
None <- dfa[grepl("None", dfa$types), ]

# Join the subsets in a single dataframe to build the figure
df_box <- rbind(SspBCDE, Gao_Qat, RM, RTA, PDT4, PDT7, Cas, CBASS, Gabija, Septu, None)
# Keep only the columns used below. The previous `df_box[, 355-357]` was
# evaluated as `df_box[, -2]` (drop column 2), not as a column range.
df_box <- df_box[, c("types", "total_phages")]

# Sorting
types_sorted <- c(
  "SspBCDE", "Gao_Qat", "R-M", "RosmerTA", "PD-T4", "PD-T7",
  "Cas", "CBASS", "Gabija", "Septu", "None"
)
counts <- df_box %>% group_by(types) %>% summarise(n = n())

# Calculate statistics
# Reference group against which comparisons are to be made
reference_data <- df_box$total_phages[df_box$types %in% "None"]

# Check normality
print(shapiro.test(None$total_phages))
print(shapiro.test(SspBCDE$total_phages))
print(shapiro.test(Gao_Qat$total_phages))
print(shapiro.test(RM$total_phages))
print(shapiro.test(RTA$total_phages))
print(shapiro.test(PDT4$total_phages))
print(shapiro.test(PDT7$total_phages))
print(shapiro.test(Cas$total_phages))
print(shapiro.test(CBASS$total_phages))
print(shapiro.test(Gabija$total_phages))
print(shapiro.test(Septu$total_phages))

# Mann-Whitney U test (Wilcoxon rank-sum test) of every group against "None".
# Groups are visited in their order of appearance in `df_box`.
comparison_groups <- setdiff(unique(df_box$types), "None")
results <- lapply(comparison_groups, function(group) {
  group_data <- df_box$total_phages[df_box$types %in% group]
  test <- wilcox.test(reference_data, group_data)
  list(
    group = group,
    p_value = test$p.value,
    method = test$method
  )
})
names(results) <- comparison_groups

# Adjust p-values for multiple comparisons using Bonferroni correction
raw_p_values <- vapply(results, function(x) x$p_value, numeric(1))
adjusted_p_values <- p.adjust(raw_p_values, method = "bonferroni")

# Create a data frame for annotations
annotations <- data.frame(
  group1 = "None",
  group2 = names(results),
  p_value = adjusted_p_values
)

# Add significance levels to annotations.
# include.lowest = TRUE so a p-value of exactly 0 is labelled "***", not NA.
annotations$significance <- cut(
  annotations$p_value,
  breaks = c(0, 0.001, 0.01, 0.05, 1),
  labels = c("***", "**", "*", "ns"),
  include.lowest = TRUE
)

# Comparisons are derived from the annotation table so that each bracket is
# guaranteed to carry the label computed for that same group.
signif_comparisons <- lapply(annotations$group2, function(group) c("None", group))

# Draw the plot
A <- ggplot(
  df_box,
  aes(x = factor(types, levels = types_sorted), y = total_phages, fill = types)
) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE) +
  # Per-group sample sizes; the vjust values are hand-tuned, one per row of `counts`
  geom_text(
    data = counts,
    aes(x = factor(types, levels = types_sorted), y = Inf, label = paste0("n = ", n)),
    vjust = c(20, 7, 10, 5, 20, 10, 10, 5, 8, 12, 5), hjust = 0.5, size = 4.1
  ) +
  labs(x = "Defense systems", y = "No. of prophages") +
  theme_minimal() +
  scale_y_continuous(limits = c(0, 13)) +
  theme(
    axis.title = element_text(size = 13),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  ) +
  geom_signif(
    comparisons = signif_comparisons,
    annotations = as.character(annotations$significance),
    map_signif_level = TRUE, textsize = 6, vjust = 0,
    y_position = c(12, 11.5, 11, 10.5, 10, 9.5, 9, 8.5, 8, 7.5)
  ) +
  scale_fill_manual(values = c(
    "#FFC1C1", "#8DD3C7", "#EE9A00", "#FDB462", "#696969", "#A6D854",
    "#7CCD7C", "#BC808D", "#EEDC82", "#4876FF", "#FB8072"
  ))


# PANEL B ----
# Boxplots of prophage counts for genomes with vs. without each defense
# system, computed separately within every MLST listed in MLST_list_freq.id.
library("ggplotify")
library("grid")
library("gridExtra")
library("stringr")
library(gtable)

# Loading data (row names are taken from the second column of the file)
dfb <- read.table(
  file = "matrix_90_ab_ml_1prcst_nored100_cl_mlst.tsv",
  header = TRUE, sep = "\t", row.names = 2
) # Using only freq MLSTs
dfb$X <- NULL

# Calculate total number of prophages (row sums over the first 351 columns)
dfb <- dfb %>% mutate(total_phages = rowSums(select(., 1:351)))
dfb$types[dfb$types == "nan"] <- "None"
dfb$types <- str_replace(dfb$types, "CAS", "Cas")

df_box <- dfb[, 352:354]

# NOTE(review): hard-coded, machine-specific working directory. Consider
# building paths with file.path() from a configurable base directory instead.
setwd("/home/brown/Documentos/defensome_aba/mlst_defsys_pres_aus/")

mlst_list <- read_lines("./MLST_list_freq.id") # List of freq MLSTs
types_list <- c("R-M", "RosmerTA", "Gabija", "Cas", "Gao_Qat", "CBASS", "SspBCDE", "PD-T4-5", "PD-T7-5")

# Per-iteration pieces are collected in lists and bound once after the loop
# instead of growing data frames with rbind() on every iteration.
long_parts <- list()
text_parts <- list()
# The empty template keeps the column layout even if no test is performed
test_parts <- list(
  data.frame(test_id = character(), m = character(), t = character(), p_value = numeric())
)

for (m in mlst_list) {
  print(m)
  st <- read_lines(paste0("./", m, ".ab")) # Strain ids read from this MLST's file
  df_box_mlst <- df_box[rownames(df_box) %in% st, ]

  for (sys in types_list) {
    print(sys)

    # Literal (non-regex) match of the system name inside the `types` string
    has_sys <- grepl(sys, df_box_mlst$types, fixed = TRUE)
    pres <- df_box_mlst[has_sys, ]
    pres$strains <- rownames(pres)
    rownames(pres) <- NULL
    aus <- df_box_mlst[!has_sys, ]
    aus$strains <- rownames(aus)
    rownames(aus) <- NULL

    if (nrow(pres) == 0 && nrow(aus) == 0) {
      print("There is no enough Aus/Pres data")
      next
    }

    both_present <- nrow(pres) > 0 && nrow(aus) > 0
    if (nrow(pres) > 0) {
      pres$Pres <- "Defense system presence"
    }
    if (nrow(aus) > 0) {
      aus$Pres <- "Defense system absence"
    }

    # Only bind the two halves when both exist; an empty half lacks `Pres`
    if (both_present) {
      data_box <- rbind(pres, aus)
    } else if (nrow(pres) > 0) {
      data_box <- pres
    } else {
      data_box <- aus
    }

    if (both_present) {
      print(c("pres=", nrow(pres), "aus=", nrow(aus))) # Print the Ns of presence/absence
      # Do the statistical test
      wilcox_t <- wilcox.test(pres$total_phages, aus$total_phages, alternative = "two.sided")
      # Keep track of test information for clarity
      test_parts[[length(test_parts) + 1]] <- data.frame(
        test_id = paste(m, sys, sep = "_"), m = m, t = sys, p_value = wilcox_t$p.value
      )
      print(wilcox_t$p.value) # Print p-value
    }

    data_box$types <- sys
    data_box$mlst <- m
    long_parts[[length(long_parts) + 1]] <- data_box

    if (both_present) {
      data_text <- data_box %>% group_by(Pres) %>% tally()
      data_text$types <- sys
      data_text$mlst <- m
      # NOTE(review): uses the raw (unadjusted) p-value of this single test
      data_text$signif <- wilcox_t$p.value < 0.05
      text_parts[[length(text_parts) + 1]] <- data_text
    }
  }
}

data_long <- bind_rows(long_parts)
data_text_long <- bind_rows(text_parts)
test_info <- bind_rows(test_parts)
all_p_values <- test_info$p_value # p-values of every test performed, in loop order

# Draw plot
mlst_levels <- c("ST2", "ST79", "ST1", "ST3", "ST499", "ST10", "ST78", "ST25")
B <- ggplot() +
  geom_boxplot(
    data = data_long,
    aes(x = Pres, y = total_phages, fill = Pres),
    position = position_dodge(width = 0.1, preserve = "total"),
    width = .4, show.legend = TRUE
  ) +
  scale_fill_manual(values = c("#FF7256", "#BCEE68")) +
  scale_y_continuous(breaks = c(2, 4, 6), limits = c(0, 6)) +
  labs(y = "No. of prophages") +
  # Facet rows follow `mlst_levels`; the formula refers to the layer's own
  # `mlst` column rather than reaching back into `data_long$mlst`.
  facet_grid(
    factor(mlst, levels = mlst_levels) ~ types,
    scales = "free", space = "free", as.table = TRUE
  ) +
  theme_minimal_grid() +
  theme(
    strip.background = element_rect(fill = "gray"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  )

# NOTE(review): annotate("rect") is called without any coordinates, so it does
# not describe a rectangle; looks like an unfinished annotation - confirm intent.
B + annotate("rect")


# PANEL C & D ----
# One chord diagram per input table, drawn into a 2 x 3 base-graphics grid.
library(circlize) # Plot circos
library(gridGraphics)

# NOTE(review): hard-coded, machine-specific working directory.
setwd("/home/brown/Documentos/defensome_aba/circos")

par(mfcol = c(2, 3)) # Parameters to build the plot grid (filled column-wise)
f <- list() # NOTE(review): not used anywhere in the visible code

# Colors: named vector mapping each sector name to its colour.
# NOTE(review): "P1003" is defined twice below ("#3A5FCD" and "#CDAA7D");
# name-based lookup returns the first entry, so the second is never used -
# confirm which colour (or which id) was intended.
grid_col = c("P1012" = "#FF4040", "P1179"="#FF6A6A", "P1151"="#CD5B45", "P1054"="#FFA07A", "P1251"="#EE2C2C", "P1371"="#FF7F50",
             "DgiS1"="#E0EEEE","P1017"="#7AC5CD","P1055"="darkslategray1","P1231"="#1874CD","P1233"="#B0E2FF","P1311"="#B0C4DE","P1003"="#3A5FCD","P1023"="#00C5CD",
             "P1059"="#BF3EFF","P1114"="mediumpurple4","P1031"="mediumorchid1","P1183"="mediumpurple1","P1107"="#68228B","P1115"="#68228B","P1310"="#DDA0DD",
             "P1050"="lightgoldenrod2","P1009"="#FFD700","P1041"="#FFF68F","P1011"="#FFA500","P1002"="#DAA520", "P1240" ="rosybrown2", "P1471"="#EEAEEE", "P1075"="#FFB6C1", "P1105"="palevioletred2", "P1339"="#CD8C95",
             "P18"="#BCEE68","P2140"="olivedrab1","P1035"="#9AFF9A", "P1003"="#CDAA7D","P1068"="#F4A460","P1184"="#DEB887","P1074"="#CD853F","P1372"="#FFD39B","P1331"="#CD853F","P1076"="#D2B48C","P1165"="#8B5A2B","P1140"="#FF8C00","P1161"="#FF7F24","P1255"="#EE7600","P1491"="#CD661D","P1661"="#FFA07A","P1167"="#FF8247",
             "Cas,Gao_Qat,Gabija"="darkgoldenrod1","R-M,Cas,Gabija,RosmerTA"="#FFC1C1","R-M,Cas,Gao_Qat,Gabija"="#EEC591","R-M,Cas,Gao_Qat,Gabija,CBASS,RosmerTA"="lightsteelblue3","R-M,Cas,Gao_Qat,Gabija,RosmerTA"="#EE6363","R-M,Gabija,RosmerTA"="palegreen3","R-M,Gao_Qat,Gabija,RosmerTA"="#68228B",
             "SspBCDE"="#EE6363","SspBCDE,PD-T4,PD-T7"="palegreen3","SspBCDE,Gao_Qat"="darkgoldenrod1","SspBCDE,Gao_Qat,PD-T4,PD-T7"="mediumpurple1",
             "Cas,RosmerTA,PD-T4"="#FFC1C1","R-M,Cas,RosmerTA,PD-T4"="palegreen3","R-M,Cas,RosmerTA"="darkgoldenrod1","R-M,PD-Lambda-2"="palegreen3",
             "R-M"="#68228B","R-M,SspBCDE"="lightsteelblue3","R-M,Cas"="#CDB5CD", "R-M,RosmerTA" = "#EEC591", "R-M,CBASS,RosmerTA"="lightsteelblue3","R-M,PD-T4,PD-T7"="mediumpurple1","PD-T4,PD-T7"="palegreen3","R-M,CBASS,RosmerTA,PD-T4,PD-T7"="#EE6363",
             "R-M,CBASS"="lightsteelblue3","R-M,CBASS,PD-T7"="#EE6363","R-M,Gao_Qat,CBASS"="darkgoldenrod1"
)

# Dataset ids are kept as character strings throughout: the previous mixed
# vector c(1, 2, 25, 3, '00', '0cas') was silently coerced to character anyway.
panel_c_ids <- c("1", "2", "25", "3") # read from "..._2.tsv" files
panel_d_ids <- c("00", "0cas")        # read from "..._1.tsv" files

# Draw the plots in a loop
for (st in c(panel_c_ids, panel_d_ids)) {
  file_suffix <- if (st %in% panel_c_ids) "_2.tsv" else "_1.tsv"
  for_circos <- read.csv(
    paste0("ds_ph_circos_st", st, file_suffix),
    header = TRUE, sep = "\t"
  ) # Loading data

  chordDiagram(for_circos, grid.col = grid_col, transparency = 0.6) # Draw the plot

  # Reset circlize's internal state after every diagram so the next one
  # starts from a clean layout.
  circos.clear()
}

# Build the legends for each circos plot ----
# The legends are obtained by drawing a throw-away bar chart per dataset and
# extracting only its bottom legend; the bar heights themselves are irrelevant.

# Combination -> colour for every legend (same colours as used in `grid_col`)
legend_palettes <- list(
  st1 = c(
    "Cas,Gao_Qat,Gabija" = "darkgoldenrod1",
    "R-M,Cas,Gabija,RosmerTA" = "#FFC1C1",
    "R-M,Cas,Gao_Qat,Gabija" = "#EEC591",
    "R-M,Cas,Gao_Qat,Gabija,CBASS,RosmerTA" = "lightsteelblue3",
    "R-M,Cas,Gao_Qat,Gabija,RosmerTA" = "#EE6363",
    "R-M,Gabija,RosmerTA" = "palegreen3",
    "R-M,Gao_Qat,Gabija,RosmerTA" = "#68228B"
  ),
  st2 = c(
    "SspBCDE" = "#EE6363",
    "SspBCDE,PD-T4,PD-T7" = "palegreen3",
    "SspBCDE,Gao_Qat" = "darkgoldenrod1",
    "SspBCDE,Gao_Qat,PD-T4,PD-T7" = "mediumpurple1"
  ),
  st25 = c(
    "Cas,RosmerTA,PD-T4" = "#FFC1C1",
    "R-M,Cas,RosmerTA,PD-T4" = "palegreen3",
    "R-M,Cas,RosmerTA" = "darkgoldenrod1"
  ),
  st00 = c(
    "SspBCDE" = "#EE6363",
    "R-M" = "#68228B",
    "R-M,SspBCDE" = "lightsteelblue3",
    "R-M,Cas" = "#CDB5CD"
  ),
  st3 = c(
    "R-M" = "#68228B",
    "R-M,PD-Lambda-2" = "palegreen3"
  ),
  st499 = c(
    "R-M" = "#68228B",
    "R-M,RosmerTA" = "#EEC591"
  ),
  st10 = c(
    "R-M" = "#68228B",
    "R-M,CBASS,RosmerTA" = "lightsteelblue3",
    "R-M,PD-T4,PD-T7" = "mediumpurple1",
    "PD-T4,PD-T7" = "palegreen3",
    "R-M,CBASS,RosmerTA,PD-T4,PD-T7" = "#EE6363"
  ),
  st79 = c(
    "R-M" = "#68228B",
    "R-M,Cas" = "#CDB5CD",
    "R-M,RosmerTA" = "#EEC591"
  ),
  st78 = c(
    "R-M,CBASS" = "lightsteelblue3",
    "R-M,CBASS,PD-T7" = "#EE6363",
    "R-M,Gao_Qat,CBASS" = "darkgoldenrod1"
  )
)

# Dummy dataset with one bar per combination. A constant height replaces the
# previous rnorm() draws so the script is deterministic and leaves the RNG
# state untouched.
make_legend_data <- function(palette) {
  data.frame(comb = names(palette), n = 1)
}

# Throw-away bar chart whose fill legend (placed at the bottom) lists every
# combination in `palette`, laid out in `legend_cols` columns.
make_legend_plot <- function(legend_data, palette, legend_cols = 3) {
  ggplot(legend_data, aes(x = comb, y = n, fill = comb)) +
    geom_col() +
    scale_fill_manual(values = palette) +
    labs(fill = "Combinations") +
    theme(legend.position = "bottom") +
    guides(fill = guide_legend(ncol = legend_cols))
}

# Building datasets
st1 <- make_legend_data(legend_palettes[["st1"]])
st2 <- make_legend_data(legend_palettes[["st2"]])
st25 <- make_legend_data(legend_palettes[["st25"]])
st00 <- make_legend_data(legend_palettes[["st00"]])
st3 <- make_legend_data(legend_palettes[["st3"]])
st499 <- make_legend_data(legend_palettes[["st499"]])
st10 <- make_legend_data(legend_palettes[["st10"]])
st79 <- make_legend_data(legend_palettes[["st79"]])
st78 <- make_legend_data(legend_palettes[["st78"]])

# Plotting the legends (the ST1 legend uses 2 columns, all others 3)
st1_gp <- make_legend_plot(st1, legend_palettes[["st1"]], legend_cols = 2)
st2_gp <- make_legend_plot(st2, legend_palettes[["st2"]])
st25_gp <- make_legend_plot(st25, legend_palettes[["st25"]])
st3_gp <- make_legend_plot(st3, legend_palettes[["st3"]])
st499_gp <- make_legend_plot(st499, legend_palettes[["st499"]])
st10_gp <- make_legend_plot(st10, legend_palettes[["st10"]])
st79_gp <- make_legend_plot(st79, legend_palettes[["st79"]])
st78_gp <- make_legend_plot(st78, legend_palettes[["st78"]])
st00_gp <- make_legend_plot(st00, legend_palettes[["st00"]])

# Extract only the legend component of each plot
lg1 <- get_plot_component(st1_gp, "guide-box-bottom")
lg2 <- get_plot_component(st2_gp, "guide-box-bottom")
lg25 <- get_plot_component(st25_gp, "guide-box-bottom")
lg3 <- get_plot_component(st3_gp, "guide-box-bottom")
lg499 <- get_plot_component(st499_gp, "guide-box-bottom")
lg10 <- get_plot_component(st10_gp, "guide-box-bottom")
lg79 <- get_plot_component(st79_gp, "guide-box-bottom")
lg78 <- get_plot_component(st78_gp, "guide-box-bottom")
lg00 <- get_plot_component(st00_gp, "guide-box-bottom")
# NOTE(review): lg00 is built above but is not part of this arrangement -
# confirm whether it should be added here or is placed separately.
ggarrange(lg1, lg2, lg25, lg3, lg499, lg10, lg79, lg78, ncol = 1)
